Statement Of Contribution: Assignment 1: Jin Yan (jinya425)
Assignment 2: Dinesh Sundaramoorthy (dinsu875)

Assignment One

library(plotly)
## 载入需要的程辑包:ggplot2
## 
## 载入程辑包:'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(seriation)
library(ggplot2)
library(dplyr)
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)

1.1

# 1.1
# Read the tab-delimited data set and keep the label column (column 1)
# together with the ten variables selected for the analysis.
mydata <- read.delim("prices-and-earnings.txt", header = TRUE)
mydata <- mydata[, c(1, 2, 5, 6, 7, 9, 10, 16, 17, 18, 19)]

# The first column supplies the row labels; every remaining column is
# converted to numeric, which turns the data frame into a numeric matrix.
city_labels <- mydata[, 1]
mydata <- sapply(mydata[, -1], as.numeric)
rownames(mydata) <- city_labels

1.2

# 1.2
# Centre and scale every column so that all variables share one colour range.
data_scaled <- scale(mydata)

# Heatmap of the scaled matrix in its original order: x and y only carry the
# column and row labels, z carries the values.
heat_palette <- colorRamp(c("yellow", "red"))
plot_ly(
  z = data_scaled,
  x = colnames(data_scaled),
  y = rownames(data_scaled),
  type = "heatmap",
  colors = heat_palette
)

Answer: It is impossible to see clusters and outliers.

1.3

# 1.3


# EUCLIDEAN DISTANCE

# Row ordering: distances between observations (rows of the scaled matrix),
# seriated with hierarchical clustering.
euclidean_dist_row <- dist(data_scaled)
cluster_row <- seriate(euclidean_dist_row, method = "HC")
order_row <- get_order(cluster_row)

# Column ordering: same procedure on the transposed matrix, so that the
# distances are taken between variables.
data_t <- t(data_scaled)
euclidean_dist_col <- dist(data_t)
cluster_col <- seriate(euclidean_dist_col, method = "HC")
order_col <- get_order(cluster_col)

# Permute rows and columns of the scaled data according to both orderings.
matrix_new <- data_scaled[order_row, order_col]

# Heatmap of the reordered matrix (Euclidean distance, HC).
plot_ly(
  z = matrix_new,
  x = colnames(matrix_new),
  y = rownames(matrix_new),
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
)
# 1-CORRELATION DISTANCE

# Column ordering: cor() correlates the columns of its argument, so the
# scaled matrix yields variable-by-variable dissimilarities.
cor_dist_col <- as.dist(1 - cor(data_scaled))
cor_order_col <- get_order(seriate(cor_dist_col, method = "HC"))

# Row ordering: the transposed matrix has the observations as columns, so
# cor() now yields observation-by-observation dissimilarities.
cor_dist_row <- as.dist(1 - cor(data_t))
cor_order_row <- get_order(seriate(cor_dist_row, method = "HC"))

# Scaled data permuted by the correlation-based orderings.
matrix_new_2 <- data_scaled[cor_order_row, cor_order_col]

# Heatmap of the reordered matrix (1 - correlation distance, HC).
plot_ly(
  z = matrix_new_2,
  x = colnames(matrix_new_2),
  y = rownames(matrix_new_2),
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
)

Answer: By comparison, the first plot, based on Euclidean distance, is easier to analyze. With the 1-correlation distance we can only find clusters of data samples that are linearly correlated, which says nothing about how different they are in essence: samples end up close together in the map merely because their variables follow a similar pattern. This is not the case for Euclidean distance, where the data samples within one cluster are genuinely close to each other. This is what we want.

Analysis: From the Euclidean plot, we can see two big clusters. In the first cluster, covering the variables Hours.Worked to Big.Mac.min., the values are relatively high, especially for Bangkok, Delhi, and Jakarta. In the other cluster, the remaining variables are higher. The difference can correspond to the economic conditions at that time.

1.4

# 1.4
# EUCLIDEAN DISTANCE, TSP SOLVER

# The TSP heuristic is randomised, so without a fixed seed every run gives a
# different ordering and the heatmap discussed in the report cannot be
# reproduced. Fix the seed before the first TSP call.
set.seed(12345)

# Column ordering: distances between variables (rows of the transposed matrix).
data_t <- t(data_scaled)
euclidean_dist_col <- dist(data_t)
cluster_col <- seriate(euclidean_dist_col, method = "TSP")
order_col <- get_order(cluster_col)

# Row ordering: distances between observations.
euclidean_dist_row <- dist(data_scaled)
cluster_row <- seriate(euclidean_dist_row, method = "TSP")
order_row <- get_order(cluster_row)

# Scaled data permuted by the TSP orderings.
matrix_new_3 <- data_scaled[order_row, order_col]

# Heatmap of the reordered matrix (Euclidean distance, TSP).
plot_ly(
  x = colnames(matrix_new_3),
  y = rownames(matrix_new_3),
  z = matrix_new_3,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
)

Answer: By comparison, we can see that the plot obtained with “TSP” is better, since three clusters are visible in it.

# COMPARE OBJECTIVE FUNCTION VALUES

# Score each row ordering exactly once. The TSP solver is randomised, so
# calling seriate(..., method = "TSP") separately for every criterion would
# evaluate a different tour each time. Reuse `cluster_row`, the TSP row
# ordering computed for the heatmap above, so both measures describe the
# ordering that is actually plotted.
row_order_tsp <- cluster_row
row_order_hc <- seriate(euclidean_dist_row, method = "HC")

# Hamiltonian path length of each ordering.
length_path_tsp <- criterion(euclidean_dist_row, row_order_tsp,
                             method = "path_length")
length_path_hc <- criterion(euclidean_dist_row, row_order_hc,
                            method = "path_length")

# Raw gradient measure of each ordering.
gradient_measure_tsp <- criterion(euclidean_dist_row, row_order_tsp,
                                  method = "gradient_raw")
gradient_measure_hc <- criterion(euclidean_dist_row, row_order_hc,
                                 method = "gradient_raw")

# Collect the results in a 2 x 2 matrix: one row per criterion, one column
# per seriation method.
result <- matrix(
  c(length_path_tsp, length_path_hc,
    gradient_measure_tsp, gradient_measure_hc),
  nrow = 2, ncol = 2, byrow = TRUE,
  dimnames = list(c("length_path", "gradient_measure"), c("TSP", "HC"))
)

result
##                         TSP         HC
## length_path        122.8445   138.9674
## gradient_measure 50054.0000 30980.0000

1.5

# 1.5
# Parallel coordinate plots of the data as read from file.
data_unsorted <- read.delim("prices-and-earnings.txt", header = TRUE)

# Axes in display order. Each name is used both as the axis label and as the
# column that supplies the axis values (via a one-sided formula).
parcoord_vars <- c(
  "Food.Costs...", "iPhone.4S.hr.", "Clothing.Index", "Hours.Worked",
  "Wage.Net", "Vacation.Days", "Big.Mac.min.", "Bread.kg.in.min.",
  "Rice.kg.in.min.", "Goods.and.Services..."
)
parcoord_dims <- lapply(parcoord_vars, function(var_name) {
  list(label = var_name, values = as.formula(paste0("~", var_name)))
})

# Plain version: all lines share the default colour.
plot1_parallel_coordinate <- plot_ly(
  data_unsorted,
  type = "parcoords",
  dimensions = parcoord_dims
)

plot1_parallel_coordinate

# Coloured version: each line is coloured by its Food.Costs... value.
plot1_parallel_coordinate_color <- plot_ly(
  data_unsorted,
  type = "parcoords",
  line = list(color = ~Food.Costs...),
  dimensions = parcoord_dims
)

plot1_parallel_coordinate_color

Answer:

“Food.Cost”,“Clothing.Index”,“Wage.Net” are the variables that are important to define clusters.

In the first cluster, the values for Food.Cost range from 186 to about 400, the values for Clothing.Index range from 26.7 to 92, and the values for Wage.Net range from 8.1 to about 40.

By comparison, the ranges for the second cluster correspond to 420 to 720, 110 to 150, and 70 to 130 respectively.

These clusters can be interpreted. In the first cluster, most of the cities are relatively underdeveloped. The prices of electronic devices like the iPhone are high, since the average salary level is not high compared to the price of such products.

The second cluster represents the developed cities. For these cities, it is easy to buy iphone due to high salaries but goods and services prices are higher too for the same reason.

The outlier is Tokyo. Although in terms of some variables, this city should belong to the second cluster, the value in Vacation.Days is not high enough for this city.

1.6

# 1.6
# juxtaposed radars
# create one radar plot per city

library(plotly)
library(dplyr)
library(scales)

# Use the matrix whose rows and columns were reordered in 1.3 (Euclidean, HC).
price <- as.data.frame(matrix_new)
nPlot <- nrow(price)

# Move the row names into a `group` column and rescale every other column to
# [0, 1]. This replaces the deprecated add_rownames() / mutate_each(funs())
# calls with base R and dplyr::across().
price_radar <- data.frame(
  group = rownames(price),
  price,
  check.names = FALSE,
  row.names = NULL,
  stringsAsFactors = FALSE
) %>%
  mutate(across(-group, rescale))

# Build one radar per row, each wrapped in a div taking a quarter of the page
# width. The mode is set explicitly to the value plotly would otherwise pick
# by default, which avoids one "No scatterpolar mode specifed" message per
# plot.
Ps <- lapply(seq_len(nPlot), function(i) {
  htmltools::tags$div(
    plot_ly(
      type = "scatterpolar",
      mode = "markers",
      r = as.numeric(unlist(price_radar[i, -1])),
      theta = colnames(price_radar)[-1],
      fill = "toself"
    ) %>%
      layout(title = price_radar$group[i]),
    style = "width: 25%;"
  )
})

# Lay the radars out in a wrapping flex container and display it.
h <- htmltools::tags$div(style = "display: flex; flex-wrap: wrap", Ps)

htmltools::browsable(h)
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Answer: The first cluster: Geneva, Zurich, and Oslo. The second cluster:Manama, Rio, and São Paulo. The outlier is Caracas.

1.7

Answer: From the perspective of accuracy, radar plots are the best. In terms of simplicity, heatmap is the best.

Assignment Two

# Column names assigned to the adult data set, in file order.
adult_columns <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "Income_level"
)

# NOTE(review): read.csv() treats the first line as a header by default;
# confirm that adult.csv really starts with a header row, otherwise the first
# record is consumed as column names.
data <- read.csv("adult.csv")
colnames(data) <- adult_columns

2.1

Q1: Why it is problematic to analyze this plot?

In the first plot, too many points overlap each other. Different colors also cause visual confusion. It’s hard to find patterns between groups or inside one group.

Q2: What new conclusions can you make here?

For the two income levels, they both have highest density in the approximate range of 30 to 60 on the Y-axis. This may indicate that there is no particularly strong correlation between hours per week and income level.

# Shared scatter layer: hours per week against age, coloured by income level.
hours_vs_age <- ggplot(
  data,
  aes(x = age, y = hours_per_week, colour = Income_level)
) +
  geom_point(size = 1) +
  labs(x = "Age", y = "Hours per week")

# All observations in a single panel.
p1 <- hours_vs_age +
  labs(title = "Scatter plot(Hours per Week vs age)")
p1

# Same scatter, one panel per income level.
p2 <- hours_vs_age +
  labs(title = "Trellis plot(Hours per Week vs age)") +
  facet_wrap(~Income_level, ncol = 2)
p2

2.2

Q1: Analyze these two plots and make conclusions.

In the first plot, we can see that the typical age is higher in the high-income group. This implies that older people have more money than younger ones. We can also see that the ages of the low-income group concentrate around 25, while those of the high-income group concentrate between 35 and 45.

The second plot shows that most never-married people are under 25. People from 30 to 50 years old have similar distributions across the various marital statuses. Most widowed people are around 60 years old. The peak of the Married-AF-spouse curve comes at around 30 years old. Such a distribution is in line with our common sense about people's marital status. Combined with the first plot, it suggests that income may have some impact on marital status.

# Shared density layer: age distribution per income level, semi-transparent
# so that overlapping densities stay visible.
age_density <- ggplot(
  data,
  aes(x = age, group = Income_level, fill = Income_level)
) +
  geom_density(alpha = 0.5) +
  labs(x = "Age", y = "Density")

# Both income levels overlaid in one panel.
p3 <- age_density +
  labs(title = "Density plot(age grouped by the Income level)")

p3

# Same densities, one panel per marital status.
p4 <- age_density +
  labs(title = "Trellis plot(age grouped by the Income level)") +
  facet_wrap(~marital_status)

p4

2.3

Q1: Why is it difficult to analyze the 3D-Scatter plot?

At the default zoom level, many points cover each other, and after zooming in, it goes to the detail part and we can’t find the pattern in global. The transformation from 3D to 2D in our visual system is also unreliable, and may produce misunderstanding.

Q2: Analyze the trellis plot

Among the groups, we can see that the youngest group has the highest capital-loss density and the oldest group has the lowest. The other three groups are in the middle, with similar densities. This may be because young people lack experience in managing capital. Most capital loss occurs in the range 8 ~ 16 on the x-axis, which represents the education number. This may be because people with less than 12th-grade education (education number 8) did not know about capital management or did not have enough capital to manage.

Inside the youngest group, the education between 8 and 12 has the most loss density, education number of 12~16 has the second high density. But with the increasing of age, 12~16 caught up, became the most one, except the oldest group.

# Keep only the observations with a non-zero capital loss.
data_filtered <- data %>% filter(capital_loss != 0)

# 3D scatter of capital loss against education number and age.
scene_axes <- list(
  xaxis = list(title = "Education_num"),
  yaxis = list(title = "Age"),
  zaxis = list(title = "Capital loss")
)
p5 <- plot_ly(data_filtered, x = ~education_num, y = ~age, z = ~capital_loss)
p5 <- add_markers(p5) # markers are also used for the 3D scatter
p5 <- layout(p5, scene = scene_axes)

p5
# 2D density of capital loss vs education number, drawn as a raster and
# split into six age intervals with (approximately) equal numbers of
# observations.
p6 <- ggplot(data_filtered, aes(x = education_num, y = capital_loss)) +
  # after_stat(density) replaces the `..density..` notation deprecated in
  # ggplot2 3.4.0.
  stat_density_2d(aes(fill = after_stat(density)),
                  geom = "raster", contour = FALSE) +
  scale_fill_distiller(palette = "Spectral", direction = 1) +
  # Remove the padding around the raster on both axes.
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(title = "2d-density plot of Cap Loss vs Education_num",
       x = "Education Num",
       y = "Capital Loss") +
  # `age` is looked up in the plot data; change n to use a different number
  # of age intervals.
  facet_wrap(~cut_number(age, n = 6))

p6
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

2.4

Q1: Which advantages and disadvantages you see in using Shingles?

Although using shingles can avoid boundary effects, we did not find any clear advantages or disadvantages from using them in this case.

# Capital loss vs education number, one panel per age interval; cut_number()
# splits age into four intervals with roughly equal numbers of observations
# and no overlap.
age_interval_facets <- facet_wrap(~cut_number(data_filtered$age, 4))

p7_a <- ggplot(
  data_filtered,
  aes(x = education_num, y = capital_loss, colour = capital_loss)
) +
  geom_point() +
  labs(title = "Trellis plot") +
  age_interval_facets

p7_a

# Shingles: four age intervals that overlap by 10%.
# Adjust the variable, `number` and `overlap` to build other shingles.
Agerange <- lattice::equal.count(data_filtered$age, number = 4, overlap = 0.1)

# One row per shingle: lower bound, upper bound and an interval id.
L <- matrix(unlist(levels(Agerange)), ncol = 2, byrow = TRUE)
L1 <- data.frame(Lower = L[, 1], Upper = L[, 2],
                 Interval = factor(seq_len(nrow(L))))

# Show the extent (and overlap) of the intervals.
ggplot(L1) + geom_linerange(aes(ymin = Lower, ymax = Upper, x = Interval))

# Collect the rows that fall into each shingle, labelled with the interval.
# Rows inside an overlap region belong to two shingles and therefore appear
# twice in the result.
shingle_parts <- lapply(seq_len(nrow(L1)), function(i) {
  members <- which(data_filtered$age >= L1$Lower[i] &
                     data_filtered$age <= L1$Upper[i])
  part <- data_filtered[members, , drop = FALSE]
  part$Class <- rep(paste0("[", L1$Lower[i], ",", L1$Upper[i], "]"),
                    length(members))
  part
})

df4 <- do.call(rbind, shingle_parts)
df4$Class <- as.factor(df4$Class)

# Trellis plot over the shingles; adjust x, y and colour for other variables.
ggplot(df4, aes(x = education_num, y = capital_loss, color = capital_loss)) +
  geom_point() +
  labs(title = "Trellis plot(Shingles)") +
  facet_wrap(~Class, labeller = "label_both")

Appendix

library(plotly)
library(seriation)
library(ggplot2)
library(dplyr)
library(scales)
# 1.1
# Read the tab-delimited data set and keep the label column (column 1)
# together with the ten variables selected for the analysis.
mydata <- read.delim("prices-and-earnings.txt", header = TRUE)
mydata <- mydata[, c(1, 2, 5, 6, 7, 9, 10, 16, 17, 18, 19)]

# The first column supplies the row labels; every remaining column is
# converted to numeric, which turns the data frame into a numeric matrix.
city_labels <- mydata[, 1]
mydata <- sapply(mydata[, -1], as.numeric)
rownames(mydata) <- city_labels
# 1.2
# Centre and scale every column so that all variables share one colour range.
data_scaled <- scale(mydata)

# Heatmap of the scaled matrix in its original order: x and y only carry the
# column and row labels, z carries the values.
heat_palette <- colorRamp(c("yellow", "red"))
plot_ly(
  z = data_scaled,
  x = colnames(data_scaled),
  y = rownames(data_scaled),
  type = "heatmap",
  colors = heat_palette
)
# 1.3


# EUCLIDEAN DISTANCE

# Row ordering: distances between observations (rows of the scaled matrix),
# seriated with hierarchical clustering.
euclidean_dist_row <- dist(data_scaled)
cluster_row <- seriate(euclidean_dist_row, method = "HC")
order_row <- get_order(cluster_row)

# Column ordering: same procedure on the transposed matrix, so that the
# distances are taken between variables.
data_t <- t(data_scaled)
euclidean_dist_col <- dist(data_t)
cluster_col <- seriate(euclidean_dist_col, method = "HC")
order_col <- get_order(cluster_col)

# Permute rows and columns of the scaled data according to both orderings.
matrix_new <- data_scaled[order_row, order_col]

# Heatmap of the reordered matrix (Euclidean distance, HC).
plot_ly(
  z = matrix_new,
  x = colnames(matrix_new),
  y = rownames(matrix_new),
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
)


# 1-CORRELATION DISTANCE

# Column ordering: cor() correlates the columns of its argument, so the
# scaled matrix yields variable-by-variable dissimilarities.
cor_dist_col <- as.dist(1 - cor(data_scaled))
cor_order_col <- get_order(seriate(cor_dist_col, method = "HC"))

# Row ordering: the transposed matrix has the observations as columns, so
# cor() now yields observation-by-observation dissimilarities.
cor_dist_row <- as.dist(1 - cor(data_t))
cor_order_row <- get_order(seriate(cor_dist_row, method = "HC"))

# Scaled data permuted by the correlation-based orderings.
matrix_new_2 <- data_scaled[cor_order_row, cor_order_col]

# Heatmap of the reordered matrix (1 - correlation distance, HC).
plot_ly(
  z = matrix_new_2,
  x = colnames(matrix_new_2),
  y = rownames(matrix_new_2),
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
)
# 1.4
# EUCLIDEAN DISTANCE, TSP SOLVER

# The TSP heuristic is randomised, so without a fixed seed every run gives a
# different ordering and the heatmap discussed in the report cannot be
# reproduced. Fix the seed before the first TSP call.
set.seed(12345)

# Column ordering: distances between variables (rows of the transposed matrix).
data_t <- t(data_scaled)
euclidean_dist_col <- dist(data_t)
cluster_col <- seriate(euclidean_dist_col, method = "TSP")
order_col <- get_order(cluster_col)

# Row ordering: distances between observations.
euclidean_dist_row <- dist(data_scaled)
cluster_row <- seriate(euclidean_dist_row, method = "TSP")
order_row <- get_order(cluster_row)

# Scaled data permuted by the TSP orderings.
matrix_new_3 <- data_scaled[order_row, order_col]

# Heatmap of the reordered matrix (Euclidean distance, TSP).
plot_ly(
  x = colnames(matrix_new_3),
  y = rownames(matrix_new_3),
  z = matrix_new_3,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
)
# COMPARE OBJECTIVE FUNCTION VALUES

# Score each row ordering exactly once. The TSP solver is randomised, so
# calling seriate(..., method = "TSP") separately for every criterion would
# evaluate a different tour each time. Reuse `cluster_row`, the TSP row
# ordering computed for the heatmap above, so both measures describe the
# ordering that is actually plotted.
row_order_tsp <- cluster_row
row_order_hc <- seriate(euclidean_dist_row, method = "HC")

# Hamiltonian path length of each ordering.
length_path_tsp <- criterion(euclidean_dist_row, row_order_tsp,
                             method = "path_length")
length_path_hc <- criterion(euclidean_dist_row, row_order_hc,
                            method = "path_length")

# Raw gradient measure of each ordering.
gradient_measure_tsp <- criterion(euclidean_dist_row, row_order_tsp,
                                  method = "gradient_raw")
gradient_measure_hc <- criterion(euclidean_dist_row, row_order_hc,
                                 method = "gradient_raw")

# Collect the results in a 2 x 2 matrix: one row per criterion, one column
# per seriation method.
result <- matrix(
  c(length_path_tsp, length_path_hc,
    gradient_measure_tsp, gradient_measure_hc),
  nrow = 2, ncol = 2, byrow = TRUE,
  dimnames = list(c("length_path", "gradient_measure"), c("TSP", "HC"))
)

result
# 1.5
# Parallel coordinate plots of the data as read from file.
data_unsorted <- read.delim("prices-and-earnings.txt", header = TRUE)

# Axes in display order. Each name is used both as the axis label and as the
# column that supplies the axis values (via a one-sided formula).
parcoord_vars <- c(
  "Food.Costs...", "iPhone.4S.hr.", "Clothing.Index", "Hours.Worked",
  "Wage.Net", "Vacation.Days", "Big.Mac.min.", "Bread.kg.in.min.",
  "Rice.kg.in.min.", "Goods.and.Services..."
)
parcoord_dims <- lapply(parcoord_vars, function(var_name) {
  list(label = var_name, values = as.formula(paste0("~", var_name)))
})

# Plain version: all lines share the default colour.
plot1_parallel_coordinate <- plot_ly(
  data_unsorted,
  type = "parcoords",
  dimensions = parcoord_dims
)

plot1_parallel_coordinate

# Coloured version: each line is coloured by its Food.Costs... value.
plot1_parallel_coordinate_color <- plot_ly(
  data_unsorted,
  type = "parcoords",
  line = list(color = ~Food.Costs...),
  dimensions = parcoord_dims
)

plot1_parallel_coordinate_color
# 1.6
# juxtaposed radars
# create one radar plot per city

library(plotly)
library(dplyr)
library(scales)

# Use the matrix whose rows and columns were reordered in 1.3 (Euclidean, HC).
price <- as.data.frame(matrix_new)
nPlot <- nrow(price)

# Move the row names into a `group` column and rescale every other column to
# [0, 1]. This replaces the deprecated add_rownames() / mutate_each(funs())
# calls with base R and dplyr::across().
price_radar <- data.frame(
  group = rownames(price),
  price,
  check.names = FALSE,
  row.names = NULL,
  stringsAsFactors = FALSE
) %>%
  mutate(across(-group, rescale))

# Build one radar per row, each wrapped in a div taking a quarter of the page
# width. The mode is set explicitly to the value plotly would otherwise pick
# by default, which avoids one "No scatterpolar mode specifed" message per
# plot.
Ps <- lapply(seq_len(nPlot), function(i) {
  htmltools::tags$div(
    plot_ly(
      type = "scatterpolar",
      mode = "markers",
      r = as.numeric(unlist(price_radar[i, -1])),
      theta = colnames(price_radar)[-1],
      fill = "toself"
    ) %>%
      layout(title = price_radar$group[i]),
    style = "width: 25%;"
  )
})

# Lay the radars out in a wrapping flex container and display it.
h <- htmltools::tags$div(style = "display: flex; flex-wrap: wrap", Ps)

htmltools::browsable(h)
# Column names assigned to the adult data set, in file order.
adult_columns <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "Income_level"
)

# NOTE(review): read.csv() treats the first line as a header by default;
# confirm that adult.csv really starts with a header row, otherwise the first
# record is consumed as column names.
data <- read.csv("adult.csv")
colnames(data) <- adult_columns

# Shared scatter layer: hours per week against age, coloured by income level.
hours_vs_age <- ggplot(
  data,
  aes(x = age, y = hours_per_week, colour = Income_level)
) +
  geom_point(size = 1) +
  labs(x = "Age", y = "Hours per week")

# All observations in a single panel.
p1 <- hours_vs_age +
  labs(title = "Scatter plot(Hours per Week vs age)")
p1

# Same scatter, one panel per income level.
p2 <- hours_vs_age +
  labs(title = "Trellis plot(Hours per Week vs age)") +
  facet_wrap(~Income_level, ncol = 2)
p2
# Shared density layer: age distribution per income level, semi-transparent
# so that overlapping densities stay visible.
age_density <- ggplot(
  data,
  aes(x = age, group = Income_level, fill = Income_level)
) +
  geom_density(alpha = 0.5) +
  labs(x = "Age", y = "Density")

# Both income levels overlaid in one panel.
p3 <- age_density +
  labs(title = "Density plot(age grouped by the Income level)")

p3

# Same densities, one panel per marital status.
p4 <- age_density +
  labs(title = "Trellis plot(age grouped by the Income level)") +
  facet_wrap(~marital_status)

p4
# Keep only the observations with a non-zero capital loss.
data_filtered <- data %>% filter(capital_loss != 0)

# 3D scatter of capital loss against education number and age.
scene_axes <- list(
  xaxis = list(title = "Education_num"),
  yaxis = list(title = "Age"),
  zaxis = list(title = "Capital loss")
)
p5 <- plot_ly(data_filtered, x = ~education_num, y = ~age, z = ~capital_loss)
p5 <- add_markers(p5) # markers are also used for the 3D scatter
p5 <- layout(p5, scene = scene_axes)

p5

# 2D density of capital loss vs education number, drawn as a raster and
# split into six age intervals with (approximately) equal numbers of
# observations.
p6 <- ggplot(data_filtered, aes(x = education_num, y = capital_loss)) +
  # after_stat(density) replaces the `..density..` notation deprecated in
  # ggplot2 3.4.0.
  stat_density_2d(aes(fill = after_stat(density)),
                  geom = "raster", contour = FALSE) +
  scale_fill_distiller(palette = "Spectral", direction = 1) +
  # Remove the padding around the raster on both axes.
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(title = "2d-density plot of Cap Loss vs Education_num",
       x = "Education Num",
       y = "Capital Loss") +
  # `age` is looked up in the plot data; change n to use a different number
  # of age intervals.
  facet_wrap(~cut_number(age, n = 6))

p6
# Capital loss vs education number, one panel per age interval; cut_number()
# splits age into four intervals with roughly equal numbers of observations
# and no overlap.
age_interval_facets <- facet_wrap(~cut_number(data_filtered$age, 4))

p7_a <- ggplot(
  data_filtered,
  aes(x = education_num, y = capital_loss, colour = capital_loss)
) +
  geom_point() +
  labs(title = "Trellis plot") +
  age_interval_facets

p7_a
# Shingles: four age intervals that overlap by 10%.
# Adjust the variable, `number` and `overlap` to build other shingles.
Agerange <- lattice::equal.count(data_filtered$age, number = 4, overlap = 0.1)

# One row per shingle: lower bound, upper bound and an interval id.
L <- matrix(unlist(levels(Agerange)), ncol = 2, byrow = TRUE)
L1 <- data.frame(Lower = L[, 1], Upper = L[, 2],
                 Interval = factor(seq_len(nrow(L))))

# Show the extent (and overlap) of the intervals.
ggplot(L1) + geom_linerange(aes(ymin = Lower, ymax = Upper, x = Interval))

# Collect the rows that fall into each shingle, labelled with the interval.
# Rows inside an overlap region belong to two shingles and therefore appear
# twice in the result.
shingle_parts <- lapply(seq_len(nrow(L1)), function(i) {
  members <- which(data_filtered$age >= L1$Lower[i] &
                     data_filtered$age <= L1$Upper[i])
  part <- data_filtered[members, , drop = FALSE]
  part$Class <- rep(paste0("[", L1$Lower[i], ",", L1$Upper[i], "]"),
                    length(members))
  part
})

df4 <- do.call(rbind, shingle_parts)
df4$Class <- as.factor(df4$Class)

# Trellis plot over the shingles; adjust x, y and colour for other variables.
ggplot(df4, aes(x = education_num, y = capital_loss, color = capital_loss)) +
  geom_point() +
  labs(title = "Trellis plot(Shingles)") +
  facet_wrap(~Class, labeller = "label_both")